################################################################
#Script: Tables_DBP.r
################################################################
#Project: Gov 2001 Project; Do Businesses Pay to Do Science?; Extension of Giuri P. and Mariani M., "When Distance Disappears", 2013
#Script Goal: Create tables associated with project
#Inputs: 
#- Data environment created from 'Analysis_DBP.R'
#Outputs: Analysis results used to calculate tables
###############################################################

#install.packages("stargazer")
library(stargazer)
library(foreign)
library(pastecs)
library(cem)
library(MatchingFrontier)
library(MatchIt)
library(Zelig)
library(ZeligChoice)

rm(list=ls())
root<-"C:/Users/sdb/Dropbox/When Distance Disappears/R Files"
setwd<-root
tempDir <- "WIP"
if (file.exists(tempDir)){
} else {
  dir.create(file.path(root, tempDir))
}

df.main <- read.dta("C:/Users/sdb/Dropbox/When Distance Disappears/Complete data for replication/data_complete.dta")
exclusion.varlist <- c("CE_dummy2","DE_dummy2","EDU","year_first_patent","residence_degree","no_pastcoinv","link_coinv_diff_nuts_share1_b","link_diff_nuts_share1_b","lEmployees_miss","D_miss_empl","Rdint","D_missC_RD","PRI_Applic","Individual_Applic","Age","Male","UniMasDegree","PhDDegree","YearMobilitybefore","Ninventors","SK_ScLit","reasonCommExploit","reasonLic","reasonImit","lgdppop_nuts3","larea_nuts3_km2","lav_pat_nuts3_9496","top1_tc","DE_dummy2","IT","ES","NL","AppYear1994","AppYear1995","AppYear1996","AppYear1997","AppYear1998","TechClass")
#Find varlist locations in column names
exclusion.varlist.locations = which(colnames(df.main) %in% exclusion.varlist)
#Drop all obersvations which are missing data in those variables
df.balanced <- df.main[complete.cases(df.main[,c(exclusion.varlist.locations)]),]

###############################################
##Table1 - Summary of Original Data
##############################################
#"Dependent variables","Inventor characteristics","Applicant controls","Patent controls","Region and other controls","Country","Year"
rowname<-c("Near","Far",
           "Age","Male","High School Degree","University Degree","PhD Degree","Far Past Coinventors","Experience","Science","Country of Degree Dummy","Mobility in Region","Mobility out Region","Experience Herfindahl","Previous Patents Dummy","Inventor Past Patents","Conference",
           "log Employees","Employees","R&D Intensity",
           "Number of Inventors","Commercial Expltitation","Licensing","Prevent Imitation",
           "GDPPC","Pop","Area","Region Patents","TOP1% in Technology","NR Research Universities","Research University Score"
           )
table1.varlist <- c("CE_dummy2","DE_dummy2",
                    "Age","Male","Low_High_School","UniMasDegree","PhDDegree","link_coinv_diff_nuts_share1_b","year_first_patent","SK_ScLit","residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out","herfinv1","zeroExp","experience","SK_Tech",
                    "lEmployeesMiss","EmployeesMiss","RDint",
                    "Ninventors","reasonCommExploit","reasonLic","reasonImit",
                    "gdppop_nuts3","pop_nuts3","area_nuts3_km2","av_pat_nuts3_9496","top1_tc","shangai_n_univ","overall_score")

table1<-df.balanced[table1.varlist]
names(table1)<-rowname

table1.1.stat<-as.data.frame(t((stat.desc(table1))))[,c(9,13)]

#for subset w/no_pastcoinv==1
summary_stats <- stat.desc(subset(df.balanced,no_pastcoinv==1))
table1.1.stat["Far Past Coinventors",]<-summary_stats[c(9,13),"link_coinv_diff_nuts_share1_b"]

#for subset w/zeroExp==1
summary_stats <- stat.desc(subset(df.balanced,zeroExp==1))
table1.1.stat["Experience Herfindahl",]<-summary_stats[c(9,13),"herfinv1"]

#for subset w/D_miss_empl==0
summary_stats <- stat.desc(subset(df.balanced,D_miss_empl==0))
table1.1.stat["Employees",]<-summary_stats[c(9,13),"EmployeesMiss"]

#for subset w/D_missC_RD==0
summary_stats <- stat.desc(subset(df.balanced,D_missC_RD==0))
table1.1.stat["R&D Intensity",]<-summary_stats[c(9,13),"RDint"]

###############################################
##Table1.2 - Summary of Imputed Data
##############################################


##Read in data from multiply imputed files, and prep for creation into table
outdata<-list(read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata1.csv"))
outdata[[2]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata2.csv")
outdata[[3]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata3.csv")
outdata[[4]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata4.csv")
outdata[[5]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata5.csv")
outdata[[6]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata6.csv")
outdata[[7]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata7.csv")
outdata[[8]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata8.csv")
outdata[[9]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata9.csv")
outdata[[10]]<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/ImputedData/outdata10.csv")


## Recalculate key variables
for(i in 1:10){
  data<-outdata[[i]]
## Recalculate dummy variables utilized in analysis
data$CE_dummy2<-0
data$CE_dummy2[data$CloseExt>0]<-1
data$DE_dummy2<-0
data$DE_dummy2[data$DistExt>0]<-1
data$Low_High_School<-0
data$Low_High_School[data$EDU==1]<-1
data$UniMasDegree<-0
data$UniMasDegree[data$EDU==2]<-1
data$PhDDegree<-0
data$PhDDegree[data$EDU==3]<-1
data$AppYear1993<-0
data$AppYear1993[data$AppYear==1993]<-1
data$AppYear1994<-0
data$AppYear1994[data$AppYear==1994]<-1
data$AppYear1995<-0
data$AppYear1995[data$AppYear==1995]<-1
data$AppYear1996<-0
data$AppYear1996[data$AppYear==1996]<-1
data$AppYear1997<-0
data$AppYear1997[data$AppYear==1997]<-1
data$AppYear1998<-0
data$AppYear1998[data$AppYear==1998]<-1

exclusion.varlist.locations = which(colnames(data) %in% exclusion.varlist)
data.balanced <- data[complete.cases(data[,c(exclusion.varlist.locations)]),]
outdata[[i]]<-data.balanced
}


table1.2<-list()
table1.2.stat<-list()
table1.2.varlist <- c("CE_dummy2","DE_dummy2",
                      "Age","Male","Low_High_School","UniMasDegree","PhDDegree","link_coinv_diff_nuts_share1_b","year_first_patent","SK_ScLit","residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out","herfinv1","zeroExp","lexperience","SK_Tech",
                      "lEmployeesMiss","EmployeesMiss","RDint",
                      "Ninventors","reasonCommExploit","reasonLic","reasonImit",
                      "gdppop_nuts3","pop_nuts3","area_nuts3_km2","av_pat_nuts3_9496","top1_tc","shangai_n_univ","overall_score")
rowname1.2<-c("Near","Far",
              "Age","Male","High School Degree","University Degree","PhD Degree","Far Past Coinventors","Experience","Science","Country of Degree Dummy","Mobility in Region","Mobility out Region","Experience Herfindahl","Previous Patents Dummy","log Inventor Past Patents","Conference",
              "log Employees","Employees","R&D Intensity",
              "Number of Inventors","Commercial Expltitation","Licensing","Prevent Imitation",
              "GDPPC","Pop","Area","Region Patents","TOP1% in Technology","NR Research Universities","Research University Score")
for(i in 1:10)
{data<-outdata[[i]]
table1.2[[i]]<-data[table1.2.varlist]
names(table1.2[[i]])<-rowname1.2
table1.2.stat[[i]]<-as.data.frame(t((stat.desc(table1.2[[i]]))))[,c(9,13)]
}
average.table1.2.stat<-table1.2.stat[[1]]
for(i in 2:10)
{average.table1.2.stat$mean<-average.table1.2.stat$mean+table1.2.stat[[i]]$mean
 average.table1.2.stat$std.dev<-average.table1.2.stat$std.dev+table1.2.stat[[i]]$std.dev
}
average.table1.2.stat<-average.table1.2.stat/10
table1.stat<-as.data.frame(c(table1.1.stat,average.table1.2.stat))

rownames(table1.stat)<-rownames(table1.1.stat)

stargazer(table1.stat,title="Descriptive Statistics of Pre-Imputed Data and Pos-Imputed Data",summary=FALSE,digits=2,nobs=FALSE,notes="a.N=3134,b.N=3877,c.N=4850,d.N=2706")


##############################################
##Table 2 Creation - summary of covariates we are matcing on
##Before and after matching using the first imputed data
##############################################
data.balanced.list<-load("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/DataList.RData")
data.balanced.cem.list<-load("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/WIP/DataCemList.RData")

table2.1.varlist<- c("Age","Male","herfinv1","lexperience","breadthexp01","link_coinv_diff_nuts_share1_b",
                      "residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out")


rowname2.1<-c("Age","Male","Experience Herfindahl","log Inventor Past Patents","Experience Breadth","Far Past Coinventors",
            "Country of Degree Dummy","Mobility in Region","Mobility out Region")

table2<-as.data.frame(c(0))
table2_mean1<-rep(0,9)
table2_mean2<-rep(0,9)
table2_sd1<-rep(0,9)
table2_sd2<-rep(0,9)
table2_mean3<-rep(0,9)
table2_mean4<-rep(0,9)
table2_sd3<-rep(0,9)
table2_sd4<-rep(0,9)

for(i in 1:10)
{
data<-data.balanced.list[[i]]
data1<-data[data$t==1,]
table2.1.1<-data1[table2.1.varlist]
data2<-data[data$t==0,]
table2.1.2<-data2[table2.1.varlist]
table2_mean1<-table2_mean1+t(stat.desc(table2.1.1))[,9]
table2_mean2<-table2_mean1+t(stat.desc(table2.1.2))[,9]
table2_sd1<-table2_sd1+t(stat.desc(table2.1.1))[,13]
table2_sd2<-table2_sd2+t(stat.desc(table2.1.2))[,13]

cem<-data.balanced.cem.list[[i]]
cem1<-cem[cem$t==1,]
table2.1.3<-cem1[table2.1.varlist]
cem2<-cem[cem$t==0,]
table2.1.4<-cem2[table2.1.varlist]


table2_mean3<-table2_mean3+t(stat.desc(table2.1.3))[,9]
table2_mean4<-table2_mean4+t(stat.desc(table2.1.4))[,9]
table2_sd3<-table2_sd3+t(stat.desc(table2.1.3))[,13]
table2_sd4<-table2_sd4+t(stat.desc(table2.1.4))[,13]
}

table2<-as.data.frame(t(rbind(table2_mean1,table2_mean2,table2_sd1,table2_sd2,table2_mean3,table2_mean4,table2_sd3,table2_sd4)))
table2<-table2/10
rownames(table2)<-rowname2.1
stargazer(table2,title="Descriptive Statistics of Unmatched Data and Matched Data",summary=FALSE,digits=2,nobs=FALSE)

#Calculate Imbalance
matching.varlist<- c("Age","Male","herfinv1","lexperience","breadthexp01","link_coinv_diff_nuts_share1_b",
                   "residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out","Location")

vec.iba.before <- names(data)
iba.before.varlist.drops <- vec.iba.before[!vec.iba.before%in% matching.varlist]
imbalance(group=data$t, data=data1,drop=iba.before.varlist.drops)#0.767

vec.iba.after <- names(cem)
iba.after.varlist.drops <- vec.iba.after[! vec.iba.after%in% matching.varlist]
imbalance(group=cem$t, data=cem,weights=cem$w,drop=iba.after.varlist.drops)#0.657


##############################################
##Table 3 - Regression tables of Science and Conference with unmatched data
##############################################

table3.varlist<-c("CE_dummy","DE_dummy","Age","Male","PhDDegree","link_coinv_diff_nuts_share1_b","no_pastcoinv",
                  "year_first_patent","residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out",
                  "herfinv1","zeroExp",
                  "lEmployeesMiss","RDint",
                  "Ninventors",
                  "lgdppop_nuts3","lpop_nuts3","larea_nuts3_km2","lav_pat_nuts3_9496","top1_tc","shangai_n_univ","overall_score")


rowname3<-c("Near","Far","Age","Male","PhD Degree","Far Past Coinventors","Coinventors Dummy",
            "Experience","Country of Degree Dummy","Mobility in Region","Mobility out Region",
            "Experience Herfindahl","Previous Patents Dummy",
            "log Employees","R&D Intensity",
            "Inventor Past Patents",
            "log GDPPC","log Pop","log Area","log Region Patents","TOP1% in Technology","NR Research Universities","Research University Score")



#run regression
s.sci.probit.1<-summary(sci.probit.1)
s.sci.probit.2<-summary(sci.probit.2)
s.conf.probit.1<-summary(conf.probit.1)
s.conf.probit.2<-summary(conf.probit.2)
output1<-as.data.frame(s.sci.probit.1$coefficient)
output2<-as.data.frame(s.sci.probit.2$coefficient)
output3<-as.data.frame(s.conf.probit.1$coefficient)
output4<-as.data.frame(s.conf.probit.2$coefficient)
print1<-output1[table3.varlist,]
print2<-output2[table3.varlist,]
print3<-output3[table3.varlist,]
print4<-output4[table3.varlist,]
table3.1<-as.data.frame(rbind(print1[,1],print2[,1],print3[,1],print4[,1]))
table3.1<-as.data.frame(t(table3.1))
rownames(table3.1)<-rowname3
stargazer(table3.1,summary=FALSE,title="Probit Estimation of Unmatched Data:Effect of PhD Degree on Conference",digits=3,nobs=FALSE)

v1<-rep("&(",27)
v2<-rep(")",27)
table3.2<-cbind(v1,round(print1[,2],3),v2,v1,round(print2[,2],3),v2,v1,round(print3[,2],3),v2,v1,round(print4[,2],3),v2)
table3.2<-as.data.frame(table3.2)
for (i in 1:27)
  {print(table3.2[i,],quote=FALSE)}

significant<-as.data.frame(t(rbind(print1[,4],print2[,4],print3[,4],print4[,4])))
rownames(significant)<-rowname3


###############################################
#Table 4 - regression of treatment on Science and Conference Using mathed data
#################################################
#run regression
s.sci.cem.probit.1<-summary(sci.cem.probit.1)
s.sci.cem.probit.2<-summary(sci.cem.probit.2)
s.conf.cem.probit.1<-summary(conf.cem.probit.1)
s.conf.cem.probit.2<-summary(conf.cem.probit.2)

output1<-as.data.frame(s.sci.cem.probit.1$coefficient)
output2<-as.data.frame(s.sci.cem.probit.2$coefficient)
output3<-as.data.frame(s.conf.cem.probit.1$coefficient)
output4<-as.data.frame(s.conf.cem.probit.2$coefficient)

print1<-output1[table3.varlist,]
print2<-output2[table3.varlist,]
print3<-output3[table3.varlist,]
print4<-output4[table3.varlist,]

table4.1<-as.data.frame(rbind(print1[,1],print2[,1],print3[,1],print4[,1]))
table4.1<-as.data.frame(t(table4.1))
rownames(table4.1)<-rowname3
colnames(table4.1)<-c("Model 1","Model 2","Model 3","Model 4")
stargazer(table4.1,summary=FALSE,title="Probit Estimation of Matched Data:Effect of PhD Degree on Conference",digits=3,nobs=FALSE)

v1<-rep("&(",27)
v2<-rep(")",27)
table4.2<-cbind(v1,round(print1[,2],3),v2,v1,round(print2[,2],3),v2,v1,round(print3[,2],3),v2,v1,round(print4[,2],3),v2)
table4.2<-as.data.frame(table4.2)

for (i in 1:23)
{print(table4.2[i,],quote=FALSE)}

significant<-as.data.frame(t(rbind(print1[,4],print2[,4],print3[,4],print4[,4])))
rownames(significant)<-rowname3

#################################################
#Table 5 - effect of phd on commercialization with both matched and unmatched data
#################################################
#run regression

table5.varlist<-c("CE_dummy","DE_dummy","Age","Male","PhDDegree","link_coinv_diff_nuts_share1_b","no_pastcoinv",
                  "year_first_patent","residence_degree","reg_mob_nuts3In","reg_mob_nuts3Out",
                  "herfinv1","zeroExp",
                  "lEmployeesMiss","RDint",
                  "Ninventors","reasonCommExploit","reasonLic","reasonImit",
                  "lgdppop_nuts3","lpop_nuts3","larea_nuts3_km2","lav_pat_nuts3_9496","top1_tc","shangai_n_univ","overall_score")


rowname5<-c("Near","Far","Age","Male","PhD Degree","Far Past Coinventors","Coinventors Dummy",
            "Experience","Country of Degree Dummy","Mobility in Region","Mobility out Region",
            "Experience Herfindahl","Previous Patents Dummy",
            "log Employees","R&D Intensity",
            "Inventor Past Patents","Commercial Expltitation","Licensing","Prevent Imitation",
            "log GDPPC","log Pop","log Area","log Region Patents","TOP1% in Technology","NR Research Universities","Research University Score")
s.com.cem.probit.1<-summary(com.cem.probit.1)
s.com.cem.probit.2<-summary(com.cem.probit.2)
s.com.cem.probit.3<-summary(com.cem.probit.3)
s.com.cem.probit.4<-summary(com.cem.probit.4)

output5.1<-as.data.frame(s.com.cem.probit.1$coefficient)
output5.2<-as.data.frame(s.com.cem.probit.2$coefficient)
output5.3<-as.data.frame(s.com.cem.probit.3$coefficient)
output5.4<-as.data.frame(s.com.cem.probit.4$coefficient)

print5.1<-output5.1[table5.varlist,]
print5.2<-output5.2[table5.varlist,]
print5.3<-output5.3[table5.varlist,]
print5.4<-output5.4[table5.varlist,]

table5.1<-as.data.frame(rbind(print5.1[,1],print5.2[,1],print5.3[,1],print5.4[,1]))
table5.1<-as.data.frame(t(table5.1))
rownames(table5.1)<-rowname5
colnames(table5.1)<-c("Model 1","Model 2","Model 3","Model 4") 
stargazer(table5.1,summary=FALSE,title="Probit Estimation of Matched Data:Effect of PhD Degree on Commercialization",digits=3,nobs=FALSE)

v1<-rep("&(",31)
v2<-rep(")",31)
table5.2<-cbind(v1,round(print5.1[,2],3),v2,v1,round(print5.2[,2],3),v2,v1,round(print5.3[,2],3),v2,v1,round(print5.4[,2],3),v2)
table5.2<-as.data.frame(table5.2)

for (i in 1:31)
{print(table5.2[i,],quote=FALSE)}

significant<-as.data.frame(t(rbind(print5.1[,4],print5.2[,4],print5.3[,4],print5.4[,4])))
rownames(significant)<-rowname5

#################################################
# Tables 6 and 7 - variable descreption
#################################################

table0<-read.csv("C:/Users/sdb/Dropbox/When Distance Disappears/R Files/variable descreption.csv")
stargazer(table0,summary=FALSE,title="Variable Descreption")
